{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from nltk.corpus import brown\n",
    "from nltk.corpus import stopwords\n",
    "import dautil as dl\n",
    "from collections import defaultdict\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from IPython.display import HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Exact tally of stopword occurrences; unseen keys start at zero.\n",
    "dd = defaultdict(int)\n",
    "# NOTE(review): presumably maps each word to an id usable by the sketch - confirm in dautil.\n",
    "words_dict = dl.collect.IdDict()\n",
    "\n",
    "# NLTK's English stopword list, held as a set for membership tests.\n",
    "sw = set(stopwords.words('english'))\n",
    "\n",
    "# Tokens of the first file listed under the Brown corpus 'news' category.\n",
    "fid = brown.fileids(categories='news')[0]\n",
    "words = brown.words(fileids=fid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Exact stopword counts used as ground truth for the sketch estimates.\n",
    "# Start from an empty tally so re-running this cell cannot double-count.\n",
    "dd.clear()\n",
    "\n",
    "for w in words:\n",
    "    # Lowercase before the membership test: the sketch is fed w.lower(),\n",
    "    # so the exact counts must use the same normalisation to be comparable.\n",
    "    token = w.lower()\n",
    "    if token in sw:\n",
    "        dd[token] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Render matplotlib figures inline in the notebook output.\n",
    "%matplotlib inline\n",
    "# NOTE(review): presumably applies seaborn-like matplotlib styling - confirm in dautil.\n",
    "dl.options.mimic_seaborn()\n",
    "# Context named 'stream_demo'; the same object is handed to both widgets below\n",
    "# and to the Subplotter in the plotting cell.\n",
    "context = dl.nb.Context('stream_demo')\n",
    "dl.nb.RcWidget(context)\n",
    "# The 2, 2 arguments match those passed to dl.plotting.Subplotter later on.\n",
    "# As the last expression, this call's value is the cell's output.\n",
    "dl.nb.LabelWidget(2, 2, context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Compare Count-Min sketch estimates with the exact stopword tallies for\n",
    "# three sketch sizes: one error histogram per size, then a box plot of all.\n",
    "sp = dl.plotting.Subplotter(2, 2, context)\n",
    "actual = np.array([dd[w] for w in sw])\n",
    "errors = []\n",
    "\n",
    "for i in range(1, 4):\n",
    "    # Depth and width both double with each step of i.\n",
    "    cm = dl.perf.CountMinSketch(depth=5 * 2 ** i,\n",
    "                                width=20 * 2 ** i)\n",
    "\n",
    "    # Feed every lowercased token of the document into the sketch.\n",
    "    for w in words:\n",
    "        cm.add(words_dict.add_or_get(w.lower()))\n",
    "\n",
    "    # Query the sketch for each stopword, in the same order as `actual`.\n",
    "    estimates = np.array([cm.estimate_count(words_dict.add_or_get(w))\n",
    "                          for w in sw])\n",
    "    diff = estimates - actual\n",
    "    errors.append(diff)\n",
    "\n",
    "    # Advance to a new subplot for every histogram after the first\n",
    "    # (presumably the Subplotter starts on its first axes - confirm).\n",
    "    if i > 1:\n",
    "        sp.next_ax()\n",
    "\n",
    "    # `density=True` replaces `normed=True`, which was removed from\n",
    "    # Axes.hist in newer matplotlib releases.\n",
    "    sp.ax.hist(diff, density=True,\n",
    "               bins=dl.stats.sqrt_bins(actual))\n",
    "    sp.label()\n",
    "\n",
    "# Last subplot: box plot of the error arrays of all three sketches.\n",
    "sp.next_ax().boxplot(errors)\n",
    "sp.label()\n",
    "HTML(sp.exit())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
